Tech Arsenal 1

home *** CD-ROM | disk | FTP | other *** search

/ Tech Arsenal 1 / Tech Arsenal (Arsenal Computer).ISO / tek-01 / us20src.zip / BIC.C < prev next >

Wrap

C/C++ Source or Header | 1992-06-27 | 11KB | 578 lines

/* BIC: Bickel's Algorithm test program Spell Checker and Corrector (C)opyright May 1990, 1991 by Daniel Lawrence All Rights Reserved Revision History: 29-mar-91 Trying some filters to improve this */ #define maindef 1 #include <stdio.h> #include <ctype.h> #include "dopt.h" #include "dstruct.h" #include "ddef.h" #define MWSIZE 40 /* maximum # chars in a word */ /* letter weighing table */ char lweight[26] = { /* a */ 3, /* b */ 6, /* c */ 5, /* d */ 4, /* e */ 3, /* f */ 5, /* g */ 5, /* h */ 4, /* i */ 3, /* j */ 8, /* k */ 7, /* l */ 4, /* m */ 5, /* n */ 3, /* o */ 3, /* p */ 5, /* q */ 7, /* r */ 4, /* s */ 3, /* t */ 3, /* u */ 4, /* v */ 6, /* w */ 5, /* x */ 8, /* y */ 8, /* z */ 9 }; /* some limits */ #define MAXMATCHES 50 /* local prototypes */ #if TURBO char *cleanword(char *word); char *sortword(char *word); char *intersect(char *word1, char *word2); int weight(char *cletters); int mismatch(char *, char *); #else char *cleanword(); char *sortword(); char *intersect(); int weight(); int mismatch(); #endif /* GLOBALS for BIC */ char testwrd[MWSIZE]; /* word to test against dictionary */ char sortwrd[MWSIZE]; /* sorted letters of testwrd */ char dictwrd[MWSIZE]; /* dictionary word to compare */ int likeness; /* likeness score of current word */ int threshold; /* threshold to display likeness */ int max_matches; /* maximum number of matches */ char matches[MAXMATCHES][128]; /* list of current matches */ int match_scores[MAXMATCHES]; /* difference scores for these */ int swterse = FALSE; /* called from EMACS? */ int swverbose = FALSE; /* display extra info? */ int exact_match = FALSE; /* is this word in the dictionary? */ main(argc, argv) int argc; /* command line argument count */ char **argv; /* argument vector */ { register int dif_value; /* difference value of current word */ register char *sortptr; /* ptr to the sorted cleaned dictionary word */ register char first_char; /* first character in word to test */ max_matches = 12; /* default list length! */ if (argc < 2) { usage(); exit(EXBADOPT); } /* deal with the command options */ while (argc > 1) { /* grab the next option */ --argc; argv++; strcpy(testwrd, argv[0]); if (*testwrd == '-') { switch (testwrd[1]) { case 'm': --argc; argv++; max_matches = atoi(argv[0]); break; case 't': swterse = TRUE; break; case 'v': swverbose = TRUE; break; default: usage(); exit(EXBADOPT); } } } /* announce us */ if (swterse == FALSE) printf("BIC %s Bickel's Test Algorithm\n", VERSION); /* initialize the lcase array */ init_lcase(); /* grab the test word */ strcpy(sortwrd, sortword(cleanword(testwrd))); threshold = weight(intersect(sortwrd, sortwrd)) - 4; if (swverbose) printf("threshold = %u\n", threshold); /* initailize the list of matches */ init_match(); /* open the main dictionary */ if (mopen() == FALSE) exit(EXMDICT); first_char = lcase[*testwrd]; skipdict(first_char); strcpy(dictwrd, nxtmword()); /* scan the dictionary */ while (lcase[*dictwrd] == first_char) { /* dump them if off by more than 3 chars! */ if (abs(strlen(dictwrd) - strlen(testwrd)) < 4) { /* clean and sort the word */ sortptr = sortword(cleanword(dictwrd)); /* calculate the likeness score */ likeness = weight(intersect(sortwrd, sortptr)); /* tell us about it */ if (likeness > threshold) { /* are these nearly alike? */ if ((dif_value = mismatch(sortwrd, sortptr)) < 20) { add_match(dictwrd, dif_value); /* is this the word? */ if (strcmp(testwrd, dictwrd) == 0) { exact_match = TRUE; if (swterse) { printf("EXACT MATCH\n"); mclose(); exit(EXGOOD); } } } } } /* get a dictionary word */ nextword: strcpy(dictwrd, nxtmword()); /* at the end... skip out */ if (strcmp(dictwrd, hivalue) == 0) break; } /* close the main dictionary */ mclose(); /* show us the results */ show_match(); /* error code of EXMISS if word is not in dictionary */ if (exact_match) { exit(EXGOOD); } else { exit(EXMISS); } } usage() /* print the command line usage */ { printf("BIC: Bickel's Algorithm Test Program\n"); printf(" for MicroSPELL %s\n",VERSION); puts("\nUsage\n"); puts(" BIC {-t|-v} {-m <n>} <test word>\n"); puts(" -m <n> return top <n> suggestions / default = 12"); puts(" -t terse mode"); puts(" -v verbose mode"); } #if RAMSIZE & LATTICE & MSDOS /* These routines will allow me to track memory usage by placing a layer on top of the standard system malloc() and free() calls. with this code defined, the number of allocated bytes is displayed in the upper right corner of the screen */ #undef malloc #undef free char *allocate(nbytes) /* allocate nbytes and track */ unsigned nbytes; /* # of bytes to allocate */ { char *mp; /* ptr returned from malloc */ char *malloc(); mp = malloc(nbytes); if (mp) { envram += nbytes; #if RAMSHOW dspram(); #endif } return(mp); } release(mp) /* release malloced memory and track */ char *mp; /* chunk of RAM to release */ { unsigned *lp; /* ptr to the long containing the block size */ if (mp) { lp = ((unsigned *)mp) - 1; /* update amount of ram currently malloced */ envram -= (long)*lp - 2; free(mp); #if RAMSHOW dspram(); #endif } } #if RAMSHOW dspram() /* display the amount of RAM currently malloced */ { char mbuf[20]; char *sp; /* TTmove(term.t_nrow - 1, 70);*/ sprintf(mbuf, "[%lu]", envram); sp = &mbuf[0]; puts(sp); } #endif #endif #if AZTEC & MSDOS #undef fgetc /* a1getc: Get an ascii char from the file input stream but DO NOT strip the high bit */ int a1getc(fp) FILE *fp; { int c; /* translated character */ c = getc(fp); /* get the character */ /* if its a <LF> char, throw it out */ while (c == 10) c = getc(fp); /* if its a <RETURN> char, change it to a LF */ if (c == '\r') c = '\n'; /* if its a ^Z, its an EOF */ if (c == 26) c = EOF; return(c); } #endif #if CMS #undef fopen /* The IBM 30xx likes to tell us when file opens fail...it's too chatty....I like to handle these myself */ FILE *cmsopen(file, mode) char *file; /* name of file to open */ char *mode; /* mode to open it in */ { quiet(1); return(fopen(file,mode)); } #endif /**********************************************************************/ /* clean word: dump all the non alpha chars, and make them lowercase only */ char *cleanword(word) char *word; /* word to clean */ { register char *sp; /* ptr into return buffer */ static char rbuf[MWSIZE]; /* return buffer */ /* scan through the word */ sp = rbuf; while (*word) { if (isalpha(*word)) { if (islower(*word)) *sp++ = *word; else *sp++ = *word + 32; } ++word; } *sp = 0; return(rbuf); } /* Sort word: bubble sort the letters in a word.... average word size is 4.5 chars. Bubble sort is most efficient at that size */ char *sortword(word) char *word; /* word to sort letters in */ { register int i, j; /* bubble sort indexes */ register int temp; /* temp for swap */ register int swapflag; /* did a swap occur? */ static char sword[MWSIZE]; /* return buffer */ /* copy the word to the return buffer */ strcpy(sword, word); /* bubble sort it */ for (i = strlen(sword) - 1; i > 0; i--) { /* scan through the list once */ swapflag = FALSE; for (j = 0; j < i; j++) { /* compare */ if (sword[j] > sword[j + 1]) { temp = sword[j]; sword[j] = sword[j + 1]; sword[j + 1] = temp; swapflag = TRUE; } } /* if no movement occurred, we are done early */ if (swapflag == FALSE) break; } return(sword); } /* intersect: find all letters occuring in 2 words */ char *intersect(word1, word2) char *word1; /* first word to intersect with */ char *word2; /* second word */ { register char *iptr; /* ptr into return buffer */ static char rbuf[MWSIZE]; /* return buffer */ /* scan through both words */ iptr = rbuf; while (*word1 && *word2) { /* if they are the same, record it */ if (*word1 == *word2) { *iptr = *word1++; ++word2; /* skip duplicates */ while (*word1 == *iptr) ++word1; while (*word2 == *iptr) ++word2; ++iptr; continue; } /* advance the appropriate word */ if (*word1 < *word2) ++word1; else ++word2; } /* terminate and return the intersection */ *iptr = 0; return(rbuf); } /* weight: using the common letters between two words, apply bickels weighing function to produce a sameness score */ int weight(cletters) char *cletters; /* common letters between two words */ { int result; /* resulting weighing factor */ /* scan the letters, add up the result */ result = 0; while (*cletters) result += lweight[*cletters++ - 'a']; return(result); } /* How many characters are different between these words? */ int mismatch(word1, word2) char *word1; /* first word to compare */ char *word2; /* second word to compare */ { int count; /* count of mismatched characters */ count = 0; while (*word1) { /* at the end of word2? */ if (*word2 == 0) { count += lweight[*word1-'a']; ++word1; } else { /* a match, advance */ if (*word1 == *word2) { ++word1; ++word2; } else { if (*word1 < *word2) { count += lweight[*word1-'a']; ++word1; } else { count += lweight[*word2-'a']; ++word2; } } } } /* past the rest */ while (*word2) { ++count; count += lweight[*word2-'a']; ++word2; } return(count); } init_match() { register int index; for (index = 0; index < max_matches; index++) { matches[index][0] = 0; match_scores[index] = 1000; } } add_match(word, score) char *word; /* word to add to match list */ int score; /* score of that word */ { register int index; /* ptr into match list */ register int rest; /* ptr into rest of list to move */ for (index = 0; index < max_matches; index++) { if (match_scores[index] > score) { for (rest = max_matches; rest > index; --rest) { match_scores[rest] = match_scores[rest-1]; strcpy(matches[rest], matches[rest-1]); } match_scores[index] = score; strcpy(matches[index], word); return; } } } show_match() { register int index; register int split; if (swterse == FALSE) { for (index = 0; index < max_matches; index++) { if (match_scores[index] > 999) return; if (swverbose) printf("[%u] %s\n", match_scores[index], matches[index]); else printf("%s\n", matches[index]); } } else { split = (max_matches + 2) / 3; for (index = 0; index < split; index++) { printf("%20s %20s %20s \n", matches[index], matches[index + split], matches[index + split * 2]); } } }